import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df_2015 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2015_YTD.csv')
df_2016 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2016_YTD.csv')
df_2017 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2017_YTD.csv')
df_2018 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2018_YTD.csv')
df_2019 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2019_YTD.csv')
df_2020 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2020_YTD.csv')
df_2021 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2021_YTD.csv')
df_2022 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2022_YTD.csv')
df_2023 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2023_YTD.csv')
df_2015.info(),df_2016.info(),df_2017.info(),df_2018.info(),df_2019.info(),df_2020.info(),df_2021.info(),df_2022.info(),df_2023.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8760 entries, 0 to 8759 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8760 non-null object 1 Parameter 8760 non-null object 2 Date (LT) 8760 non-null object 3 Year 8760 non-null int64 4 Month 8760 non-null int64 5 Day 8760 non-null int64 6 Hour 8760 non-null int64 7 NowCast Conc. 8760 non-null float64 8 AQI 8760 non-null int64 9 AQI Category 259 non-null object 10 Raw Conc. 8760 non-null float64 11 Conc. Unit 8760 non-null object 12 Duration 8760 non-null object 13 QC Name 8760 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 958.2+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8784 entries, 0 to 8783 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8784 non-null object 1 Parameter 8784 non-null object 2 Date (LT) 8784 non-null object 3 Year 8784 non-null int64 4 Month 8784 non-null int64 5 Day 8784 non-null int64 6 Hour 8784 non-null int64 7 NowCast Conc. 8784 non-null float64 8 AQI 8784 non-null int64 9 AQI Category 8705 non-null object 10 Raw Conc. 8784 non-null float64 11 Conc. Unit 8784 non-null object 12 Duration 8784 non-null object 13 QC Name 8784 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 960.9+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8760 entries, 0 to 8759 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8760 non-null object 1 Parameter 8760 non-null object 2 Date (LT) 8760 non-null object 3 Year 8760 non-null int64 4 Month 8760 non-null int64 5 Day 8760 non-null int64 6 Hour 8760 non-null int64 7 NowCast Conc. 8760 non-null float64 8 AQI 8760 non-null int64 9 AQI Category 8085 non-null object 10 Raw Conc. 8760 non-null float64 11 Conc. Unit 8760 non-null object 12 Duration 8760 non-null object 13 QC Name 8760 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 958.2+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8370 entries, 0 to 8369 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8370 non-null object 1 Parameter 8370 non-null object 2 Date (LT) 8370 non-null object 3 Year 8370 non-null int64 4 Month 8370 non-null int64 5 Day 8370 non-null int64 6 Hour 8370 non-null int64 7 NowCast Conc. 8370 non-null float64 8 AQI 8370 non-null int64 9 AQI Category 7921 non-null object 10 Raw Conc. 8370 non-null float64 11 Conc. Unit 8370 non-null object 12 Duration 8370 non-null object 13 QC Name 8370 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 915.6+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8289 entries, 0 to 8288 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8289 non-null object 1 Parameter 8289 non-null object 2 Date (LT) 8289 non-null object 3 Year 8289 non-null int64 4 Month 8289 non-null int64 5 Day 8289 non-null int64 6 Hour 8289 non-null int64 7 NowCast Conc. 8289 non-null float64 8 AQI 8289 non-null int64 9 AQI Category 8271 non-null object 10 Raw Conc. 8289 non-null float64 11 Conc. Unit 8289 non-null object 12 Duration 8289 non-null object 13 QC Name 8289 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 906.7+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8598 entries, 0 to 8597 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8598 non-null object 1 Parameter 8598 non-null object 2 Date (LT) 8598 non-null object 3 Year 8598 non-null int64 4 Month 8598 non-null int64 5 Day 8598 non-null int64 6 Hour 8598 non-null int64 7 NowCast Conc. 8598 non-null float64 8 AQI 8598 non-null int64 9 AQI Category 8438 non-null object 10 Raw Conc. 8598 non-null float64 11 Conc. Unit 8598 non-null object 12 Duration 8598 non-null object 13 QC Name 8598 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 940.5+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8542 entries, 0 to 8541 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8542 non-null object 1 Parameter 8542 non-null object 2 Date (LT) 8542 non-null object 3 Year 8542 non-null int64 4 Month 8542 non-null int64 5 Day 8542 non-null int64 6 Hour 8542 non-null int64 7 NowCast Conc. 8542 non-null float64 8 AQI 8542 non-null int64 9 AQI Category 8536 non-null object 10 Raw Conc. 8542 non-null float64 11 Conc. Unit 8542 non-null object 12 Duration 8542 non-null object 13 QC Name 8542 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 934.4+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8268 entries, 0 to 8267 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 8268 non-null object 1 Parameter 8268 non-null object 2 Date (LT) 8268 non-null object 3 Year 8268 non-null int64 4 Month 8268 non-null int64 5 Day 8268 non-null int64 6 Hour 8268 non-null int64 7 NowCast Conc. 8268 non-null float64 8 AQI 8268 non-null int64 9 AQI Category 8234 non-null object 10 Raw Conc. 8268 non-null float64 11 Conc. Unit 8268 non-null object 12 Duration 8268 non-null object 13 QC Name 8268 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 904.4+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 3388 entries, 0 to 3387 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 3388 non-null object 1 Parameter 3388 non-null object 2 Date (LT) 3388 non-null object 3 Year 3388 non-null int64 4 Month 3388 non-null int64 5 Day 3388 non-null int64 6 Hour 3388 non-null int64 7 NowCast Conc. 3388 non-null float64 8 AQI 3388 non-null int64 9 AQI Category 3383 non-null object 10 Raw Conc. 3388 non-null float64 11 Conc. Unit 3388 non-null object 12 Duration 3388 non-null object 13 QC Name 3388 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 370.7+ KB
(None, None, None, None, None, None, None, None, None)
df_2015.head(),df_2016.head(),df_2017.head(),df_2018.head(),df_2019.head(),df_2020.head(),df_2021.head(),df_2022.head(),df_2023.head()
( Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2015-01-01 01:00 AM 2015 1 1
1 Jakarta South PM2.5 - Principal 2015-01-01 02:00 AM 2015 1 1
2 Jakarta South PM2.5 - Principal 2015-01-01 03:00 AM 2015 1 1
3 Jakarta South PM2.5 - Principal 2015-01-01 04:00 AM 2015 1 1
4 Jakarta South PM2.5 - Principal 2015-01-01 05:00 AM 2015 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. Conc. Unit Duration \
0 1 -999.0 -999 NaN -999.0 UG/M3 1 Hr
1 2 -999.0 -999 NaN -999.0 UG/M3 1 Hr
2 3 -999.0 -999 NaN -999.0 UG/M3 1 Hr
3 4 -999.0 -999 NaN -999.0 UG/M3 1 Hr
4 5 -999.0 -999 NaN -999.0 UG/M3 1 Hr
QC Name
0 Missing
1 Missing
2 Missing
3 Missing
4 Missing ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2016-01-01 01:00 AM 2016 1 1
1 Jakarta South PM2.5 - Principal 2016-01-01 02:00 AM 2016 1 1
2 Jakarta South PM2.5 - Principal 2016-01-01 03:00 AM 2016 1 1
3 Jakarta South PM2.5 - Principal 2016-01-01 04:00 AM 2016 1 1
4 Jakarta South PM2.5 - Principal 2016-01-01 05:00 AM 2016 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 80.2 164 Unhealthy 85.0
1 2 82.6 165 Unhealthy 85.0
2 3 49.3 135 Unhealthy for Sensitive Groups 16.0
3 4 33.1 95 Moderate 17.0
4 5 33.0 95 Moderate 33.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2017-01-01 01:00 AM 2017 1 1
1 Jakarta South PM2.5 - Principal 2017-01-01 02:00 AM 2017 1 1
2 Jakarta South PM2.5 - Principal 2017-01-01 03:00 AM 2017 1 1
3 Jakarta South PM2.5 - Principal 2017-01-01 04:00 AM 2017 1 1
4 Jakarta South PM2.5 - Principal 2017-01-01 05:00 AM 2017 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 54.4 148 Unhealthy for Sensitive Groups 115.0
1 2 58.1 152 Unhealthy 60.0
2 3 47.2 130 Unhealthy for Sensitive Groups 39.0
3 4 43.8 121 Unhealthy for Sensitive Groups 41.0
4 5 40.8 114 Unhealthy for Sensitive Groups 38.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Invalid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2018-01-01 01:00 AM 2018 1 1
1 Jakarta South PM2.5 - Principal 2018-01-01 02:00 AM 2018 1 1
2 Jakarta South PM2.5 - Principal 2018-01-01 03:00 AM 2018 1 1
3 Jakarta South PM2.5 - Principal 2018-01-01 04:00 AM 2018 1 1
4 Jakarta South PM2.5 - Principal 2018-01-01 05:00 AM 2018 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 76.5 162 Unhealthy 97.0
1 2 79.2 163 Unhealthy 82.0
2 3 58.1 152 Unhealthy 37.0
3 4 38.5 108 Unhealthy for Sensitive Groups 19.0
4 5 20.7 69 Moderate 3.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2019-01-01 01:00 AM 2019 1 1
1 Jakarta South PM2.5 - Principal 2019-01-01 02:00 AM 2019 1 1
2 Jakarta South PM2.5 - Principal 2019-01-01 03:00 AM 2019 1 1
3 Jakarta South PM2.5 - Principal 2019-01-01 04:00 AM 2019 1 1
4 Jakarta South PM2.5 - Principal 2019-01-01 05:00 AM 2019 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. Conc. Unit Duration \
0 1 22.0 72 Moderate 29.0 UG/M3 1 Hr
1 2 11.5 48 Good 1.0 UG/M3 1 Hr
2 3 10.2 43 Good 9.0 UG/M3 1 Hr
3 4 7.6 32 Good 5.0 UG/M3 1 Hr
4 5 6.3 26 Good 5.0 UG/M3 1 Hr
QC Name
0 Valid
1 Valid
2 Valid
3 Valid
4 Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2020-01-01 01:00 AM 2020 1 1
1 Jakarta South PM2.5 - Principal 2020-01-01 02:00 AM 2020 1 1
2 Jakarta South PM2.5 - Principal 2020-01-01 03:00 AM 2020 1 1
3 Jakarta South PM2.5 - Principal 2020-01-01 04:00 AM 2020 1 1
4 Jakarta South PM2.5 - Principal 2020-01-01 05:00 AM 2020 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 65.3 156 Unhealthy 59.0
1 2 49.6 136 Unhealthy for Sensitive Groups 34.0
2 3 32.3 93 Moderate 15.0
3 4 24.6 77 Moderate 17.0
4 5 20.3 68 Moderate 16.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2021-01-01 01:00 AM 2021 1 1
1 Jakarta South PM2.5 - Principal 2021-01-01 02:00 AM 2021 1 1
2 Jakarta South PM2.5 - Principal 2021-01-01 03:00 AM 2021 1 1
3 Jakarta South PM2.5 - Principal 2021-01-01 04:00 AM 2021 1 1
4 Jakarta South PM2.5 - Principal 2021-01-01 05:00 AM 2021 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 43.7 121 Unhealthy for Sensitive Groups 47.0
1 2 38.3 108 Unhealthy for Sensitive Groups 33.0
2 3 32.6 94 Moderate 27.0
3 4 29.3 87 Moderate 26.0
4 5 22.6 73 Moderate 16.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2022-01-01 01:00 AM 2022 1 1
1 Jakarta South PM2.5 - Principal 2022-01-01 02:00 AM 2022 1 1
2 Jakarta South PM2.5 - Principal 2022-01-01 03:00 AM 2022 1 1
3 Jakarta South PM2.5 - Principal 2022-01-01 04:00 AM 2022 1 1
4 Jakarta South PM2.5 - Principal 2022-01-01 05:00 AM 2022 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 53.7 146 Unhealthy for Sensitive Groups 69.0
1 2 42.3 118 Unhealthy for Sensitive Groups 31.0
2 3 32.1 93 Moderate 22.0
3 4 20.0 68 Moderate 8.0
4 5 15.0 57 Moderate 10.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid ,
Site Parameter Date (LT) Year Month Day \
0 Jakarta South PM2.5 - Principal 2023-01-01 01:00 AM 2023 1 1
1 Jakarta South PM2.5 - Principal 2023-01-01 02:00 AM 2023 1 1
2 Jakarta South PM2.5 - Principal 2023-01-01 03:00 AM 2023 1 1
3 Jakarta South PM2.5 - Principal 2023-01-01 04:00 AM 2023 1 1
4 Jakarta South PM2.5 - Principal 2023-01-01 05:00 AM 2023 1 1
Hour NowCast Conc. AQI AQI Category Raw Conc. \
0 1 44.7 124 Unhealthy for Sensitive Groups 50.0
1 2 29.3 87 Moderate 14.0
2 3 20.1 68 Moderate 11.0
3 4 16.5 60 Moderate 13.0
4 5 14.2 55 Moderate 12.0
Conc. Unit Duration QC Name
0 UG/M3 1 Hr Valid
1 UG/M3 1 Hr Valid
2 UG/M3 1 Hr Valid
3 UG/M3 1 Hr Valid
4 UG/M3 1 Hr Valid )
df_2015.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2016.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2017.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2018.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2019.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2020.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2021.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2022.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2023.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
#check data out-of-range
df_2015.loc[(df_2015['NowCast_Conc']<0)|(df_2015['AQI']<0)|(df_2015['Raw_Conc']<=0)].count()
Site 8505 Parameter 8505 Date_LT 8505 Year 8505 Month 8505 Day 8505 Hour 8505 NowCast_Conc 8505 AQI 8505 AQI_Category 4 Raw_Conc 8505 Conc_Unit 8505 Duration 8505 QC_Name 8505 dtype: int64
#check data out-of-range
df_2016.loc[(df_2016['NowCast_Conc']<0)|(df_2016['AQI']<0)|(df_2016['Raw_Conc']<=0)].count()
Site 127 Parameter 127 Date_LT 127 Year 127 Month 127 Day 127 Hour 127 NowCast_Conc 127 AQI 127 AQI_Category 48 Raw_Conc 127 Conc_Unit 127 Duration 127 QC_Name 127 dtype: int64
#check data out-of-range
df_2017.loc[(df_2017['NowCast_Conc']<0)|(df_2017['AQI']<0)|(df_2017['Raw_Conc']<=0)].count()
Site 849 Parameter 849 Date_LT 849 Year 849 Month 849 Day 849 Hour 849 NowCast_Conc 849 AQI 849 AQI_Category 174 Raw_Conc 849 Conc_Unit 849 Duration 849 QC_Name 849 dtype: int64
#check data out-of-range
df_2018.loc[(df_2018['NowCast_Conc']<0)|(df_2018['AQI']<0)|(df_2018['Raw_Conc']<=0)].count()
Site 535 Parameter 535 Date_LT 535 Year 535 Month 535 Day 535 Hour 535 NowCast_Conc 535 AQI 535 AQI_Category 86 Raw_Conc 535 Conc_Unit 535 Duration 535 QC_Name 535 dtype: int64
#check data out-of-range
df_2019.loc[(df_2019['NowCast_Conc']<0)|(df_2019['AQI']<0)|(df_2019['Raw_Conc']<=0)].count()
Site 49 Parameter 49 Date_LT 49 Year 49 Month 49 Day 49 Hour 49 NowCast_Conc 49 AQI 49 AQI_Category 31 Raw_Conc 49 Conc_Unit 49 Duration 49 QC_Name 49 dtype: int64
#check data out-of-range
df_2020.loc[(df_2020['NowCast_Conc']<0)|(df_2020['AQI']<0)|(df_2020['Raw_Conc']<=0)].count()
Site 210 Parameter 210 Date_LT 210 Year 210 Month 210 Day 210 Hour 210 NowCast_Conc 210 AQI 210 AQI_Category 50 Raw_Conc 210 Conc_Unit 210 Duration 210 QC_Name 210 dtype: int64
#check data out-of-range
df_2021.loc[(df_2021['NowCast_Conc']<0)|(df_2021['AQI']<0)|(df_2021['Raw_Conc']<=0)].count()
Site 21 Parameter 21 Date_LT 21 Year 21 Month 21 Day 21 Hour 21 NowCast_Conc 21 AQI 21 AQI_Category 15 Raw_Conc 21 Conc_Unit 21 Duration 21 QC_Name 21 dtype: int64
#check data out-of-range
df_2022.loc[(df_2022['NowCast_Conc']<0)|(df_2022['AQI']<0)|(df_2022['Raw_Conc']<=0)].count()
Site 50 Parameter 50 Date_LT 50 Year 50 Month 50 Day 50 Hour 50 NowCast_Conc 50 AQI 50 AQI_Category 16 Raw_Conc 50 Conc_Unit 50 Duration 50 QC_Name 50 dtype: int64
#check data out-of-range
df_2023.loc[(df_2023['NowCast_Conc']<0)|(df_2023['AQI']<0)|(df_2023['Raw_Conc']<=0)].count()
Site 15 Parameter 15 Date_LT 15 Year 15 Month 15 Day 15 Hour 15 NowCast_Conc 15 AQI 15 AQI_Category 10 Raw_Conc 15 Conc_Unit 15 Duration 15 QC_Name 15 dtype: int64
#drop invalid values
# List of DataFrame names for each year
df_names = ['df_2015', 'df_2016', 'df_2017', 'df_2018', 'df_2019', 'df_2020', 'df_2021', 'df_2022', 'df_2023']
for df_name in df_names:
df_year = globals()[df_name] # Access the DataFrame by its name using globals()
df_year = df_year.drop(df_year[(df_year['NowCast_Conc'] < 0) | (df_year['AQI'] < 0) | (df_year['Raw_Conc'] <= 0)].index)
df_year = df_year.reset_index(drop=True)
globals()[df_name] = df_year # Update the DataFrame with the filtered data
# merging original dataset
df_merged = pd.merge(df_2015,df_2016, how = 'outer').merge(df_2017, how = 'outer').merge(df_2018, how = 'outer').merge(df_2019, how = 'outer').merge(df_2020, how = 'outer').merge(df_2021, how = 'outer').merge(df_2022, how = 'outer').merge(df_2023, how = 'outer')
df_merged
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-11-20 09:00 PM | 2015 | 11 | 20 | 21 | 42.9 | 119 | Unhealthy for Sensitive Groups | 50.0 | UG/M3 | 1 Hr | Valid |
| 1 | Jakarta South | PM2.5 - Principal | 2015-11-20 10:00 PM | 2015 | 11 | 20 | 22 | 32.3 | 93 | Moderate | 24.0 | UG/M3 | 1 Hr | Valid |
| 2 | Jakarta South | PM2.5 - Principal | 2015-11-20 11:00 PM | 2015 | 11 | 20 | 23 | 29.5 | 88 | Moderate | 27.0 | UG/M3 | 1 Hr | Valid |
| 3 | Jakarta South | PM2.5 - Principal | 2015-11-21 12:00 AM | 2015 | 11 | 21 | 0 | 30.8 | 90 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid |
| 4 | Jakarta South | PM2.5 - Principal | 2015-11-21 01:00 AM | 2015 | 11 | 21 | 1 | 31.4 | 92 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61393 | Jakarta South | PM2.5 - Principal | 2023-05-22 08:00 AM | 2023 | 5 | 22 | 8 | 62.5 | 155 | Unhealthy | 64.0 | UG/M3 | 1 Hr | Valid |
| 61394 | Jakarta South | PM2.5 - Principal | 2023-05-22 09:00 AM | 2023 | 5 | 22 | 9 | 55.9 | 151 | Unhealthy | 49.0 | UG/M3 | 1 Hr | Valid |
| 61395 | Jakarta South | PM2.5 - Principal | 2023-05-22 10:00 AM | 2023 | 5 | 22 | 10 | 51.1 | 139 | Unhealthy for Sensitive Groups | 46.0 | UG/M3 | 1 Hr | Valid |
| 61396 | Jakarta South | PM2.5 - Principal | 2023-05-22 11:00 AM | 2023 | 5 | 22 | 11 | 51.4 | 140 | Unhealthy for Sensitive Groups | 51.0 | UG/M3 | 1 Hr | Valid |
| 61397 | Jakarta South | PM2.5 - Principal | 2023-05-22 12:00 PM | 2023 | 5 | 22 | 12 | 51.2 | 140 | Unhealthy for Sensitive Groups | 51.0 | UG/M3 | 1 Hr | Valid |
61398 rows × 14 columns
# Dictionary to store the sampled data for each year
sampled_data = {}
# Iterate over each DataFrame
for df_name in df_names:
df_year = globals()[df_name] # Access the DataFrame by its name using globals()
# Select data points at 2 AM, 8 AM, 2 PM, and 8 PM
selected_hours = [2, 8, 14, 20]
df_6hr = df_year[df_year['Hour'].isin(selected_hours)]
sampled_data[df_name] = df_6hr # Store the sampled data in the dictionary
# Access the sampled data for a specific year
df_2015_6hr = sampled_data['df_2015']
df_2016_6hr = sampled_data['df_2016']
df_2017_6hr = sampled_data['df_2017']
df_2018_6hr = sampled_data['df_2018']
df_2019_6hr = sampled_data['df_2019']
df_2020_6hr = sampled_data['df_2020']
df_2021_6hr = sampled_data['df_2021']
df_2022_6hr = sampled_data['df_2022']
df_2023_6hr = sampled_data['df_2023']
df_6hour = pd.merge(df_2015_6hr,df_2016_6hr, how = 'outer').merge(df_2017_6hr, how = 'outer').merge(df_2018_6hr, how = 'outer').merge(df_2019_6hr, how = 'outer').merge(df_2020_6hr, how = 'outer').merge(df_2021_6hr, how = 'outer').merge(df_2022_6hr, how = 'outer').merge(df_2023_6hr, how = 'outer')
df_6hour
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-11-21 02:00 AM | 2015 | 11 | 21 | 2 | 31.7 | 92 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid |
| 1 | Jakarta South | PM2.5 - Principal | 2015-11-21 08:00 AM | 2015 | 11 | 21 | 8 | 54.2 | 147 | Unhealthy for Sensitive Groups | 58.0 | UG/M3 | 1 Hr | Valid |
| 2 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00 AM | 2015 | 12 | 22 | 2 | 23.0 | 74 | Moderate | 23.0 | UG/M3 | 1 Hr | Valid |
| 3 | Jakarta South | PM2.5 - Principal | 2015-12-22 08:00 AM | 2015 | 12 | 22 | 8 | 32.5 | 94 | Moderate | 34.0 | UG/M3 | 1 Hr | Valid |
| 4 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00 PM | 2015 | 12 | 22 | 14 | 11.7 | 49 | Good | 10.0 | UG/M3 | 1 Hr | Valid |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10247 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00 AM | 2023 | 5 | 21 | 8 | 77.3 | 162 | Unhealthy | 72.0 | UG/M3 | 1 Hr | Valid |
| 10248 | Jakarta South | PM2.5 - Principal | 2023-05-21 02:00 PM | 2023 | 5 | 21 | 14 | 48.1 | 132 | Unhealthy for Sensitive Groups | 46.0 | UG/M3 | 1 Hr | Valid |
| 10249 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00 PM | 2023 | 5 | 21 | 20 | 39.8 | 112 | Unhealthy for Sensitive Groups | 36.0 | UG/M3 | 1 Hr | Valid |
| 10250 | Jakarta South | PM2.5 - Principal | 2023-05-22 02:00 AM | 2023 | 5 | 22 | 2 | 39.2 | 110 | Unhealthy for Sensitive Groups | 43.0 | UG/M3 | 1 Hr | Valid |
| 10251 | Jakarta South | PM2.5 - Principal | 2023-05-22 08:00 AM | 2023 | 5 | 22 | 8 | 62.5 | 155 | Unhealthy | 64.0 | UG/M3 | 1 Hr | Valid |
10252 rows × 14 columns
df_6hour.head()
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-11-21 02:00 AM | 2015 | 11 | 21 | 2 | 31.7 | 92 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid |
| 1 | Jakarta South | PM2.5 - Principal | 2015-11-21 08:00 AM | 2015 | 11 | 21 | 8 | 54.2 | 147 | Unhealthy for Sensitive Groups | 58.0 | UG/M3 | 1 Hr | Valid |
| 2 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00 AM | 2015 | 12 | 22 | 2 | 23.0 | 74 | Moderate | 23.0 | UG/M3 | 1 Hr | Valid |
| 3 | Jakarta South | PM2.5 - Principal | 2015-12-22 08:00 AM | 2015 | 12 | 22 | 8 | 32.5 | 94 | Moderate | 34.0 | UG/M3 | 1 Hr | Valid |
| 4 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00 PM | 2015 | 12 | 22 | 14 | 11.7 | 49 | Good | 10.0 | UG/M3 | 1 Hr | Valid |
df_6hour.tail()
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10247 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00 AM | 2023 | 5 | 21 | 8 | 77.3 | 162 | Unhealthy | 72.0 | UG/M3 | 1 Hr | Valid |
| 10248 | Jakarta South | PM2.5 - Principal | 2023-05-21 02:00 PM | 2023 | 5 | 21 | 14 | 48.1 | 132 | Unhealthy for Sensitive Groups | 46.0 | UG/M3 | 1 Hr | Valid |
| 10249 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00 PM | 2023 | 5 | 21 | 20 | 39.8 | 112 | Unhealthy for Sensitive Groups | 36.0 | UG/M3 | 1 Hr | Valid |
| 10250 | Jakarta South | PM2.5 - Principal | 2023-05-22 02:00 AM | 2023 | 5 | 22 | 2 | 39.2 | 110 | Unhealthy for Sensitive Groups | 43.0 | UG/M3 | 1 Hr | Valid |
| 10251 | Jakarta South | PM2.5 - Principal | 2023-05-22 08:00 AM | 2023 | 5 | 22 | 8 | 62.5 | 155 | Unhealthy | 64.0 | UG/M3 | 1 Hr | Valid |
df_6hour.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10252 entries, 0 to 10251 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Site 10252 non-null object 1 Parameter 10252 non-null object 2 Date_LT 10252 non-null object 3 Year 10252 non-null int64 4 Month 10252 non-null int64 5 Day 10252 non-null int64 6 Hour 10252 non-null int64 7 NowCast_Conc 10252 non-null float64 8 AQI 10252 non-null int64 9 AQI_Category 10252 non-null object 10 Raw_Conc 10252 non-null float64 11 Conc_Unit 10252 non-null object 12 Duration 10252 non-null object 13 QC_Name 10252 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 1.2+ MB
df_6hour.shape
(10252, 14)
df_6hour.isnull().sum()
Site 0 Parameter 0 Date_LT 0 Year 0 Month 0 Day 0 Hour 0 NowCast_Conc 0 AQI 0 AQI_Category 0 Raw_Conc 0 Conc_Unit 0 Duration 0 QC_Name 0 dtype: int64
from IPython.display import FileLink
# Save DataFrame to CSV file
df_6hour.to_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Clean_JakartaSouth_PM2.5_6HR_YTD.csv', index=False)
df_6hour.describe()
| Year | Month | Day | Hour | NowCast_Conc | AQI | Raw_Conc | |
|---|---|---|---|---|---|---|---|
| count | 10252.00000 | 10252.000000 | 10252.000000 | 10252.000000 | 10252.000000 | 10252.000000 | 10252.000000 |
| mean | 2019.20835 | 6.390753 | 15.725029 | 11.012876 | 40.687651 | 107.522532 | 41.218104 |
| std | 2.17036 | 3.464111 | 8.779137 | 6.715321 | 22.398155 | 39.903786 | 24.150764 |
| min | 2015.00000 | 1.000000 | 1.000000 | 2.000000 | 0.100000 | 0.000000 | 1.000000 |
| 25% | 2017.00000 | 3.000000 | 8.000000 | 8.000000 | 23.800000 | 76.000000 | 23.000000 |
| 50% | 2019.00000 | 6.000000 | 16.000000 | 8.000000 | 37.500000 | 106.000000 | 37.000000 |
| 75% | 2021.00000 | 9.000000 | 23.000000 | 20.000000 | 53.600000 | 146.000000 | 55.000000 |
| max | 2023.00000 | 12.000000 | 31.000000 | 20.000000 | 182.800000 | 233.000000 | 187.000000 |
#Plot a graph to show the peak of Raw conc.
# Group data by hour and calculate average raw concentration
hourly_avg = df_merged.groupby('Hour')['Raw_Conc'].mean()
# Create line graph
plt.plot(range(24), hourly_avg.values)
plt.xticks(range(24))
plt.xlabel('Hour of the day')
plt.ylabel('Average Raw_Conc')
plt.title('Average Raw concentration by Hour')
plt.grid(True)
plt.show()
# Group data by hour and calculate average raw concentration
hourly_avg = df_6hour.groupby('Hour')['Raw_Conc'].mean()
# Create line graph
plt.plot(hourly_avg.values)
plt.xticks(np.arange(4), ['2am', '8am', '14pm', '20pm'])
plt.xlabel('Every 6 Hour of the day')
plt.ylabel('Average Raw_Conc')
plt.title('Average Raw concentration by every 6 Hour')
plt.grid(True)
plt.show()
sns.boxplot(x=df_6hour.AQI)
plt.show()
# Calculate the IQR
Q1 = np.percentile(df_6hour['AQI'], 25)
Q3 = np.percentile(df_6hour['AQI'], 75)
IQR = Q3 - Q1
# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['AQI'] < lower_bound) | (df_6hour['AQI'] > upper_bound)]
num_outliers = len(outliers)
print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
# Print the number of outliers
print("Number of outliers for AQI:", num_outliers)
Q1 is 76.0 Q3 is 146.0 lower_bound is -29.0 upper_bound is 251.0 Number of outliers for AQI: 0
sns.boxplot(x=df_6hour.NowCast_Conc)
plt.show()
# Calculate the IQR
Q1 = np.percentile(df_6hour['NowCast_Conc'], 25)
Q3 = np.percentile(df_6hour['NowCast_Conc'], 75)
IQR = Q3 - Q1
# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['NowCast_Conc'] < lower_bound) | (df_6hour['NowCast_Conc'] > upper_bound)]
num_outliers = len(outliers)
# Print the number of outliers
print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
print("Number of outliers for NowCast_Conc:", num_outliers)
Q1 is 23.8 Q3 is 53.6 lower_bound is -20.900000000000002 upper_bound is 98.30000000000001 Number of outliers for NowCast_Conc: 179
sns.boxplot(x=df_6hour.Raw_Conc)
plt.show()
# Calculate the IQR
Q1 = np.percentile(df_6hour['Raw_Conc'], 25)
Q3 = np.percentile(df_6hour['Raw_Conc'], 75)
IQR = Q3 - Q1
# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['Raw_Conc'] < lower_bound) | (df_6hour['Raw_Conc'] > upper_bound)]
num_outliers = len(outliers)
# Print the number of outliers
print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
print("Number of outliers for Raw_Conc:", num_outliers)
Q1 is 23.0 Q3 is 55.0 lower_bound is -25.0 upper_bound is 103.0 Number of outliers for Raw_Conc: 187
# Create the grouped boxplot
plt.figure(figsize=(20, 6))
sns.boxplot(data=df_6hour, x='Month', y='AQI', hue='Year', palette='Set1')
# Set the plot title and axis labels
plt.title(' Boxplot of AQI by Month and Year')
plt.xlabel('Month')
plt.ylabel('AQI')
# Show the legend
plt.legend(title='Year', bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)
# Display the plot
plt.show()
# Plot boxplot of AQI values for each year
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_6hour, x='Year', y='AQI')
ax.set_xlabel('Year')
ax.set_ylabel('AQI')
ax.set_title('AQI Boxplot by Year')
plt.show()
# Plot boxplot of AQI values for each month
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_6hour, x='Month', y='AQI')
ax.set_xlabel('Month')
ax.set_ylabel('AQI')
ax.set_title('AQI Boxplot by Month')
plt.show()
# Get the unique months and years
months = range(1, 13) # 12 months
years = df_6hour['Year'].unique() # Unique years from the 'Year' column in df_master
# Create empty lists for x_values and y_values
x_values = []
y_values = []
# Iterate over years and months
for year in years:
for month in months:
# Filter the DataFrame for the specific year and month
filtered_df = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]
# Get the AQI values for the filtered data
aqi_values = filtered_df['AQI'].tolist()
# Append the AQI values to y_values list
y_values.extend(aqi_values)
# Create a string representation of the month and year
month_year = f"{month}-{year}"
# Append the month-year string to x_values list for each AQI value in the filtered data
x_values.extend([month_year] * len(aqi_values))
plt.figure(figsize=(16,8))
# Create the scatter plot using seaborn
sns.scatterplot(x=x_values, y=y_values, hue=x_values, palette='Set1', s=100)
# Set tAttributeErrorhe plot title and axis labels
plt.title('AQI Scatter Plot')
plt.xlabel('Time')
plt.ylabel('AQI')
# Rotate x-axis labels for better readability
plt.xticks(rotation=90,ha='right')
plt.legend().set_visible(False)
# Display the plot
plt.show()
# Group data by month and calculate average raw concentration
monthly_avg = df_6hour.groupby('Month')['Raw_Conc'].mean()
# Create line graph
plt.plot(range(12), monthly_avg.values)
plt.xticks(range(12))
plt.xlabel('Month')
plt.ylabel('Average Raw Concentration')
plt.title('Average Raw Concentration by Month')
plt.grid(True)
plt.show()
# Convert 'Date_LT' to datetime format
df_6hour['Date_LT'] = pd.to_datetime(df_6hour['Date_LT'])
plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['NowCast_Conc'])
# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('NowCast_Conc')
plt.title('Scatter Plot of NowCast_Conc')
# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)
# Show the plot
plt.show()
plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['AQI'])
# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('AQI')
plt.title('Scatter Plot of AQI')
# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)
# Show the plot
plt.show()
plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['Raw_Conc'])
# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('Raw_Conc')
plt.title('Scatter Plot of Raw_Conc')
# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)
# Show the plot
plt.show()
# Group data by month and calculate average AQI
monthly_aqi = df_6hour.groupby('Month')['AQI'].mean()
# Create line graph
plt.plot(range(12), monthly_aqi.values)
plt.xticks(range(12))
plt.xlabel('Month')
plt.ylabel('Average AQI')
plt.title('Average AQI by Month')
# Show the plot
plt.show()
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['NowCast_Conc'],color='Red')
p.axes.set_title("NowCast_Conc Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/2453233521.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 p=sns.distplot(df_6hour['NowCast_Conc'],color='Red')
Text(0.5, 1.0, 'NowCast_Conc Distribution')
df_6hour['NowCast_Conc'].median()
37.5
df_6hour['NowCast_Conc'].value_counts()
24.6 31
38.6 30
25.7 30
35.3 30
27.9 29
..
131.4 1
111.0 1
111.9 1
131.8 1
2.4 1
Name: NowCast_Conc, Length: 1083, dtype: int64
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['AQI'],color='Red')
p.axes.set_title("AQI Values Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/2869661363.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 p=sns.distplot(df_6hour['AQI'],color='Red')
Text(0.5, 1.0, 'AQI Values Distribution')
df_6hour['AQI'].median()
106.0
df_6hour['AQI'].value_counts().head(15)
152 210 154 176 155 174 153 171 156 162 158 138 157 130 151 122 75 113 77 111 80 108 85 104 162 103 87 101 160 100 Name: AQI, dtype: int64
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['Raw_Conc'],color='Red')
p.axes.set_title("Raw_Conc Values Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/269364611.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 p=sns.distplot(df_6hour['Raw_Conc'],color='Red')
Text(0.5, 1.0, 'Raw_Conc Values Distribution')
df_6hour['Raw_Conc'].median()
37.0
df_6hour['Raw_Conc'].value_counts()
27.0 223
17.0 207
38.0 203
24.0 200
48.0 191
...
146.0 1
160.0 1
158.0 1
175.0 1
128.0 1
Name: Raw_Conc, Length: 151, dtype: int64
# Histogram of AQI each month over years
# Get the unique months and years
months = range(1, 13) # 12 months
years = df_6hour['Year'].unique() # Unique years from the 'Year' column in df_6hour
# Set the number of subplots
num_subplots = len(years) * len(months)
# Set the number of columns and rows for subplots
num_cols = 4 # Number of columns of subplots
num_rows = -(-num_subplots // num_cols) # Round up division
# Set the height of each subplot
subplot_height = 5
# Calculate the figure height based on the number of rows and the desired subplot height
fig_height = num_rows * subplot_height
# Create a new figure and set the size
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, fig_height))
# Flatten the axes array
axes = axes.flatten()
# Iterate over years and months
for i, (year, month) in enumerate([(year, month) for year in years for month in months]):
# Exclude January to October 2015
if year == 2015 and month < 11:
continue
# Filter the DataFrame for the specific year and month
filtered_df = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]
# Check if there is no data for the month
if filtered_df.empty:
continue
# Get the AQI values for the filtered data
aqi_values = filtered_df['AQI'].tolist()
# Create a string representation of the month and year
month_year = f"{month}-{year}"
# Create a histogram of AQI values for each month by year
sns.histplot(aqi_values, kde=True, ax=axes[i])
# Set the plot title and axis labels
axes[i].set_title(f"AQI Histogram - {month_year}")
axes[i].set_xlabel("AQI")
axes[i].set_ylabel("Frequency")
# Remove x-axis label for all but the last row
if i // num_cols != num_rows - 1:
axes[i].set_xlabel("")
sns.histplot(df_2016_6hr['AQI'], kde=True)
<Axes: xlabel='AQI', ylabel='Count'>
sns.histplot(df_2017_6hr['AQI'], kde=True)
<Axes: xlabel='AQI', ylabel='Count'>
sns.histplot(df_2018_6hr['AQI'], kde=True)
<Axes: xlabel='AQI', ylabel='Count'>
sns.histplot(df_2019_6hr['AQI'], kde=True)
<Axes: xlabel='AQI', ylabel='Count'>
df_6hour
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-11-21 02:00:00 | 2015 | 11 | 21 | 2 | 31.7 | 92 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid |
| 1 | Jakarta South | PM2.5 - Principal | 2015-11-21 08:00:00 | 2015 | 11 | 21 | 8 | 54.2 | 147 | Unhealthy for Sensitive Groups | 58.0 | UG/M3 | 1 Hr | Valid |
| 2 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00:00 | 2015 | 12 | 22 | 2 | 23.0 | 74 | Moderate | 23.0 | UG/M3 | 1 Hr | Valid |
| 3 | Jakarta South | PM2.5 - Principal | 2015-12-22 08:00:00 | 2015 | 12 | 22 | 8 | 32.5 | 94 | Moderate | 34.0 | UG/M3 | 1 Hr | Valid |
| 4 | Jakarta South | PM2.5 - Principal | 2015-12-22 14:00:00 | 2015 | 12 | 22 | 14 | 11.7 | 49 | Good | 10.0 | UG/M3 | 1 Hr | Valid |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10247 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00:00 | 2023 | 5 | 21 | 8 | 77.3 | 162 | Unhealthy | 72.0 | UG/M3 | 1 Hr | Valid |
| 10248 | Jakarta South | PM2.5 - Principal | 2023-05-21 14:00:00 | 2023 | 5 | 21 | 14 | 48.1 | 132 | Unhealthy for Sensitive Groups | 46.0 | UG/M3 | 1 Hr | Valid |
| 10249 | Jakarta South | PM2.5 - Principal | 2023-05-21 20:00:00 | 2023 | 5 | 21 | 20 | 39.8 | 112 | Unhealthy for Sensitive Groups | 36.0 | UG/M3 | 1 Hr | Valid |
| 10250 | Jakarta South | PM2.5 - Principal | 2023-05-22 02:00:00 | 2023 | 5 | 22 | 2 | 39.2 | 110 | Unhealthy for Sensitive Groups | 43.0 | UG/M3 | 1 Hr | Valid |
| 10251 | Jakarta South | PM2.5 - Principal | 2023-05-22 08:00:00 | 2023 | 5 | 22 | 8 | 62.5 | 155 | Unhealthy | 64.0 | UG/M3 | 1 Hr | Valid |
10252 rows × 14 columns
plt.figure(figsize=(15, 6), dpi=100)
plt.grid()
# Extract the month from the 'Date_LT' column
df_6hour['Month'] = pd.to_datetime(df_6hour['Date_LT']).dt.month
# Plot the line graph
sns.lineplot(data=df_6hour, x='Month', y='AQI', hue='Year', ci=0, palette='Set1')
# Set the x-axis tick labels
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
# Set the title
plt.title('AQI Monthly Trend (by Year)')
# Show the plot
plt.show()
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/4083817283.py:8: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', 0)` for the same effect.
sns.lineplot(data=df_6hour, x='Month', y='AQI', hue='Year', ci=0, palette='Set1')
# Group the data by year and calculate the mean AQI
average_aqi_by_year = df_6hour.groupby(df_6hour['Year'])['AQI'].mean()
# Plot the graph
plt.plot(average_aqi_by_year.index, average_aqi_by_year.values, marker='o')
# Set the x-axis label
plt.xlabel('Year')
# Set the y-axis label
plt.ylabel('Average AQI')
# Set the title
plt.title('Trend of AQI over Years')
# Show the plot
plt.show()
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
jakarta_decompose = seasonal_decompose(df_6hour['AQI'], period=12)
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose.trend)
<Axes: xlabel='Date_LT', ylabel='trend'>
jakarta_decompose2 = seasonal_decompose(df_6hour['NowCast_Conc'], period=12)
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose2.trend)
<Axes: xlabel='Date_LT', ylabel='trend'>
jakarta_decompose3 = seasonal_decompose(df_6hour['Raw_Conc'], period=12)
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose3.trend)
<Axes: xlabel='Date_LT', ylabel='trend'>
# Group the data by 'AQI_Category' and 'Year' and calculate the count
grouped_data = df_6hour.groupby(['AQI_Category', 'Year']).size().unstack()
# Plot the grouped bar chart
grouped_data.plot(kind='bar', rot=0)
# Set the y-axis label and chart title
plt.ylabel('Count')
plt.title('Grouped Bar Chart of AQI_Category by Year')
plt.xticks(rotation=45)
# Show the plot
plt.show()
# Extract year, month, and day from the 'Day' column
df_6hour['Year'] = df_6hour['Date_LT'].dt.year
df_6hour['Month'] = df_6hour['Date_LT'].dt.month
df_6hour['Day'] = df_6hour['Date_LT'].dt.day
# Create the pivot table to calculate the average AQI
pivot_table_6hour = pd.pivot_table(df_6hour, values='AQI', index=['Year', 'Month', 'Day'], aggfunc='mean').reset_index()
# Rename the column to 'Average AQI'
pivot_table_6hour.rename(columns={'AQI': 'Average AQI'}, inplace=True)
# Print the resulting dataset
print(pivot_table_6hour)
Year Month Day Average AQI 0 2015 11 21 119.50 1 2015 12 22 69.75 2 2015 12 23 81.75 3 2015 12 24 80.00 4 2015 12 25 98.75 ... ... ... ... ... 2609 2023 5 18 132.25 2610 2023 5 19 146.75 2611 2023 5 20 126.00 2612 2023 5 21 141.00 2613 2023 5 22 132.50 [2614 rows x 4 columns]
# Group the data by year and classify days as unhealthy or healthy
unhealthy_threshold = 100 # Adjust this threshold as needed
pivot_table_6hour['Day Classification'] = pivot_table_6hour['Average AQI'].apply(lambda x: 'Unhealthy' if x > unhealthy_threshold else 'Healthy')
day_classification = pivot_table_6hour.groupby(['Year', 'Day Classification']).size().unstack().fillna(0)
# Plot the graph
ax = day_classification.plot(kind='bar', width=0.8)
# Set the x-axis label
ax.set_xlabel('Year')
# Set the y-axis label
ax.set_ylabel('Number of Days')
# Set the title
ax.set_title('Number of Unhealthy and Healthy Days over Years')
# Set the x-tick labels
ax.set_xticklabels(day_classification.index, rotation=45)
# Create a legend
ax.legend(['Healthy', 'Unhealthy'])
# Display the data values for each bar
for container in ax.containers:
ax.bar_label(container, fmt='%d', label_type='edge', fontsize=8)
# Show the plot
plt.show()
season_jakarta_dict = {1: 'Wet', 2: 'Wet', 3: 'Wet',4: 'Wet',5: 'Wet',6: 'Dry',
7: 'Dry', 8: 'Dry', 9: 'Dry',10: 'Dry',11: 'Wet',12: 'Wet'}
df_6hour['Season'] = df_6hour['Month'].map(season_jakarta_dict)
df_6hour
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-11-21 02:00:00 | 2015 | 11 | 21 | 2 | 31.7 | 92 | Moderate | 32.0 | UG/M3 | 1 Hr | Valid | Wet |
| 1 | Jakarta South | PM2.5 - Principal | 2015-11-21 08:00:00 | 2015 | 11 | 21 | 8 | 54.2 | 147 | Unhealthy for Sensitive Groups | 58.0 | UG/M3 | 1 Hr | Valid | Wet |
| 2 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00:00 | 2015 | 12 | 22 | 2 | 23.0 | 74 | Moderate | 23.0 | UG/M3 | 1 Hr | Valid | Wet |
| 3 | Jakarta South | PM2.5 - Principal | 2015-12-22 08:00:00 | 2015 | 12 | 22 | 8 | 32.5 | 94 | Moderate | 34.0 | UG/M3 | 1 Hr | Valid | Wet |
| 4 | Jakarta South | PM2.5 - Principal | 2015-12-22 14:00:00 | 2015 | 12 | 22 | 14 | 11.7 | 49 | Good | 10.0 | UG/M3 | 1 Hr | Valid | Wet |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10247 | Jakarta South | PM2.5 - Principal | 2023-05-21 08:00:00 | 2023 | 5 | 21 | 8 | 77.3 | 162 | Unhealthy | 72.0 | UG/M3 | 1 Hr | Valid | Wet |
| 10248 | Jakarta South | PM2.5 - Principal | 2023-05-21 14:00:00 | 2023 | 5 | 21 | 14 | 48.1 | 132 | Unhealthy for Sensitive Groups | 46.0 | UG/M3 | 1 Hr | Valid | Wet |
| 10249 | Jakarta South | PM2.5 - Principal | 2023-05-21 20:00:00 | 2023 | 5 | 21 | 20 | 39.8 | 112 | Unhealthy for Sensitive Groups | 36.0 | UG/M3 | 1 Hr | Valid | Wet |
| 10250 | Jakarta South | PM2.5 - Principal | 2023-05-22 02:00:00 | 2023 | 5 | 22 | 2 | 39.2 | 110 | Unhealthy for Sensitive Groups | 43.0 | UG/M3 | 1 Hr | Valid | Wet |
| 10251 | Jakarta South | PM2.5 - Principal | 2023-05-22 08:00:00 | 2023 | 5 | 22 | 8 | 62.5 | 155 | Unhealthy | 64.0 | UG/M3 | 1 Hr | Valid | Wet |
10252 rows × 15 columns
plt.figure(figsize=(15,6), dpi=100)
sns.lineplot(data=df_6hour, x='Date_LT', y='AQI',hue=df_6hour['Season'],palette='Set2');
plt.figure(figsize=(15, 6), dpi=100)
# Perform seasonal decomposition
jakarta_decompose = seasonal_decompose(df_6hour['AQI'], period=12)
# Plot the trend component
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose.trend, label='Trend')
# Split the plot by season using different colors
seasons = df_6hour['Season'].unique()
colors = sns.color_palette('Set2', len(seasons))
for i, season in enumerate(seasons):
season_data = df_6hour[df_6hour['Season'] == season]
sns.lineplot(x=season_data['Date_LT'], y=jakarta_decompose.trend.loc[season_data.index], color=colors[i], label=season)
# Set the plot title and axis labels
plt.title('AQI Trend by Season')
plt.xlabel('Date')
plt.ylabel('AQI')
# Display the legend
plt.legend()
# Show the plot
plt.show()
season_year_mean_aqi = df_6hour.groupby(['Season', 'Year'])['AQI'].mean().unstack()
plt.figure(figsize=(10, 6))
sns.heatmap(season_year_mean_aqi, cmap='YlOrRd', annot=True, fmt='.2f', cbar=True)
plt.xlabel('Year')
plt.ylabel('Season')
plt.title('Mean AQI for Each Season Over Years in Jakarta South')
plt.show()
# Extract day of the week
df_6hour['DayOfWeek'] = df_6hour['Date_LT'].dt.dayofweek
# Group data by day of the week and calculate average AQI
avg_rawconc_by_day = df_6hour.groupby('DayOfWeek')['Raw_Conc'].mean().reset_index()
# Define the order of days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Plot AQI values by days of the week
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(days_of_week, avg_rawconc_by_day['Raw_Conc'], marker='o')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average Raw_Conc')
ax.set_title('Raw_Conc Value by Days of the Week')
plt.show()
# Extract day of the week
df_6hour['DayOfWeek'] = df_6hour['Date_LT'].dt.dayofweek
# Group data by day of the week and calculate average AQI
avg_aqi_by_day = df_6hour.groupby('DayOfWeek')['AQI'].mean().reset_index()
# Define the order of days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Plot AQI values by days of the week
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(days_of_week, avg_aqi_by_day['AQI'], marker='o')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average AQI')
ax.set_title('AQI Value by Days of the Week')
plt.show()
df_sampled = pd.DataFrame()
k = 4 # Number of records to sample per month per year
for year in df_6hour['Year'].unique():
for month in df_6hour['Month'].unique():
df_month = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]
if len(df_month) < k:
continue
step = len(df_month) // k # Calculate the step size
# Select every kth record using systematic sampling
sampled = df_month.iloc[::step][:k]
df_sampled = pd.concat([df_sampled, sampled])
df_sampled.reset_index(drop=True, inplace=True)
df_sampled.drop(columns=['DayOfWeek', 'Season'], inplace=True)
df_sampled
| Site | Parameter | Date_LT | Year | Month | Day | Hour | NowCast_Conc | AQI | AQI_Category | Raw_Conc | Conc_Unit | Duration | QC_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jakarta South | PM2.5 - Principal | 2015-12-22 02:00:00 | 2015 | 12 | 22 | 2 | 23.0 | 74 | Moderate | 23.0 | UG/M3 | 1 Hr | Valid |
| 1 | Jakarta South | PM2.5 - Principal | 2015-12-24 14:00:00 | 2015 | 12 | 24 | 14 | 11.0 | 46 | Good | 8.0 | UG/M3 | 1 Hr | Valid |
| 2 | Jakarta South | PM2.5 - Principal | 2015-12-27 02:00:00 | 2015 | 12 | 27 | 2 | 36.2 | 103 | Unhealthy for Sensitive Groups | 50.0 | UG/M3 | 1 Hr | Valid |
| 3 | Jakarta South | PM2.5 - Principal | 2015-12-29 14:00:00 | 2015 | 12 | 29 | 14 | 21.5 | 71 | Moderate | 16.0 | UG/M3 | 1 Hr | Valid |
| 4 | Jakarta South | PM2.5 - Principal | 2016-11-01 02:00:00 | 2016 | 11 | 1 | 2 | 19.4 | 66 | Moderate | 13.0 | UG/M3 | 1 Hr | Valid |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 355 | Jakarta South | PM2.5 - Principal | 2023-04-23 02:00:00 | 2023 | 4 | 23 | 2 | 36.4 | 103 | Unhealthy for Sensitive Groups | 43.0 | UG/M3 | 1 Hr | Valid |
| 356 | Jakarta South | PM2.5 - Principal | 2023-05-01 02:00:00 | 2023 | 5 | 1 | 2 | 23.8 | 76 | Moderate | 26.0 | UG/M3 | 1 Hr | Valid |
| 357 | Jakarta South | PM2.5 - Principal | 2023-05-06 08:00:00 | 2023 | 5 | 6 | 8 | 28.5 | 85 | Moderate | 33.0 | UG/M3 | 1 Hr | Valid |
| 358 | Jakarta South | PM2.5 - Principal | 2023-05-11 14:00:00 | 2023 | 5 | 11 | 14 | 15.0 | 57 | Moderate | 10.0 | UG/M3 | 1 Hr | Valid |
| 359 | Jakarta South | PM2.5 - Principal | 2023-05-16 20:00:00 | 2023 | 5 | 16 | 20 | 26.0 | 80 | Moderate | 25.0 | UG/M3 | 1 Hr | Valid |
360 rows × 14 columns
# Save DataFrame to CSV file
df_sampled.to_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Sampled_JakartaSouth_PM2.5_6HR_YTD.csv', index=False)
df_sampled['Year'].unique()
array([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])
df_sampled['Hour'].unique()
array([ 2, 14, 20, 8])
df_sampled['Month'].unique()
array([12, 11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
df_sampled.describe()
| Year | Month | Day | Hour | NowCast_Conc | AQI | Raw_Conc | |
|---|---|---|---|---|---|---|---|
| count | 360.000000 | 360.000000 | 360.000000 | 360.000000 | 360.000000 | 360.000000 | 360.000000 |
| mean | 2019.177778 | 6.366667 | 12.122222 | 9.433333 | 42.200556 | 109.786111 | 42.569444 |
| std | 2.186705 | 3.502327 | 8.422450 | 6.846588 | 24.007838 | 40.773050 | 26.075435 |
| min | 2015.000000 | 1.000000 | 1.000000 | 2.000000 | 3.200000 | 13.000000 | 1.000000 |
| 25% | 2017.000000 | 3.000000 | 5.000000 | 2.000000 | 23.750000 | 75.750000 | 23.000000 |
| 50% | 2019.000000 | 6.000000 | 11.500000 | 8.000000 | 39.000000 | 110.000000 | 39.500000 |
| 75% | 2021.000000 | 9.000000 | 20.000000 | 14.000000 | 56.000000 | 151.000000 | 57.000000 |
| max | 2023.000000 | 12.000000 | 29.000000 | 20.000000 | 127.200000 | 188.000000 | 146.000000 |
plt.figure(figsize=(12, 6))
sns.countplot(data=df_sampled, x='AQI_Category', hue='Year', palette='Set1')
plt.xlabel('AQI Category')
plt.ylabel('Count')
plt.title('AQI Category Comparison Among All Years')
plt.legend(title='Year')
plt.show()
plt.figure(figsize=(12, 6))
sns.countplot(data=df_sampled, x='Month', hue='AQI_Category', palette='Set1')
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('AQI Category Comparison of Each Month Over Years')
plt.legend(title='AQI Category')
plt.show()
# Create a pivot table to reshape the data
df_pivot = df_sampled.pivot_table(index=['Month', 'Day', 'Hour'], columns='Year', values='AQI', aggfunc=lambda x: x)
# Rename the columns to include a prefix or suffix indicating the year
df_pivot.columns = ['AQI_' + str(year) for year in df_pivot.columns]
# Reset the index and fill NaN values with blank
df_new = df_pivot.reset_index()
# Print the new DataFrame
df_new
| Month | Day | Hour | AQI_2015 | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 2 | NaN | 165.0 | 152.0 | 163.0 | 48.0 | 136.0 | 108.0 | 118.0 | 87.0 |
| 1 | 1 | 5 | 8 | NaN | NaN | NaN | 56.0 | NaN | NaN | NaN | NaN | NaN |
| 2 | 1 | 7 | 8 | NaN | NaN | NaN | NaN | 143.0 | NaN | NaN | NaN | NaN |
| 3 | 1 | 8 | 20 | NaN | 63.0 | 92.0 | NaN | NaN | 162.0 | 81.0 | NaN | 65.0 |
| 4 | 1 | 9 | 8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 101.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 170 | 12 | 24 | 8 | NaN | NaN | NaN | 43.0 | NaN | 76.0 | 80.0 | 149.0 | NaN |
| 171 | 12 | 24 | 14 | 46.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 172 | 12 | 26 | 8 | NaN | NaN | 71.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 173 | 12 | 27 | 2 | 103.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 174 | 12 | 29 | 14 | 71.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
175 rows × 12 columns
df_new.describe()
| Month | Day | Hour | AQI_2015 | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 175.000000 | 175.000000 | 175.000000 | 4.000000 | 48.000000 | 48.000000 | 48.000000 | 48.000000 | 48.000000 | 48.000000 | 48.000000 | 20.000000 |
| mean | 6.451429 | 14.274286 | 10.640000 | 73.500000 | 112.958333 | 92.062500 | 122.979167 | 131.708333 | 123.166667 | 102.812500 | 99.229167 | 77.650000 |
| std | 3.517883 | 7.537963 | 6.671401 | 23.330952 | 39.009523 | 41.764999 | 42.396050 | 36.030105 | 37.972181 | 40.998005 | 32.390726 | 23.978664 |
| min | 1.000000 | 1.000000 | 2.000000 | 46.000000 | 52.000000 | 18.000000 | 43.000000 | 36.000000 | 35.000000 | 13.000000 | 29.000000 | 51.000000 |
| 25% | 3.500000 | 8.000000 | 2.000000 | 64.750000 | 79.000000 | 60.750000 | 87.500000 | 109.500000 | 88.500000 | 74.000000 | 74.000000 | 63.500000 |
| 50% | 6.000000 | 15.000000 | 8.000000 | 72.500000 | 110.000000 | 85.500000 | 135.500000 | 147.000000 | 131.500000 | 104.000000 | 98.000000 | 72.000000 |
| 75% | 10.000000 | 22.000000 | 14.000000 | 81.250000 | 152.000000 | 120.750000 | 160.500000 | 155.250000 | 151.250000 | 134.500000 | 123.000000 | 85.500000 |
| max | 12.000000 | 29.000000 | 20.000000 | 103.000000 | 188.000000 | 187.000000 | 182.000000 | 183.000000 | 181.000000 | 187.000000 | 160.000000 | 158.000000 |
df_2015_new = df_new[["Month","AQI_2015"]].dropna().reset_index(drop=True)
df_2015_new
| Month | AQI_2015 | |
|---|---|---|
| 0 | 12 | 74.0 |
| 1 | 12 | 46.0 |
| 2 | 12 | 103.0 |
| 3 | 12 | 71.0 |
# Filter other AQI columns and create new dataframes
df_2016_new = df_new[["Month", "AQI_2016"]].dropna().reset_index(drop=True)
df_2017_new = df_new[["Month", "AQI_2017"]].dropna().reset_index(drop=True)
df_2018_new = df_new[["Month", "AQI_2018"]].dropna().reset_index(drop=True)
df_2019_new = df_new[["Month", "AQI_2019"]].dropna().reset_index(drop=True)
df_2020_new = df_new[["Month", "AQI_2020"]].dropna().reset_index(drop=True)
df_2021_new = df_new[["Month", "AQI_2021"]].dropna().reset_index(drop=True)
df_2022_new = df_new[["Month", "AQI_2022"]].dropna().reset_index(drop=True)
df_2023_new = df_new[["Month", "AQI_2023"]].dropna().reset_index(drop=True)
# Reset index of df_2015_new starting from 44
df_2015_new = df_2015_new.reset_index(drop=True)
df_2015_new.index = df_2015_new.index + 44
df_2015_new
| Month | AQI_2015 | |
|---|---|---|
| 44 | 12 | 74.0 |
| 45 | 12 | 46.0 |
| 46 | 12 | 103.0 |
| 47 | 12 | 71.0 |
df_2023_new
| Month | AQI_2023 | |
|---|---|---|
| 0 | 1 | 87.0 |
| 1 | 1 | 65.0 |
| 2 | 1 | 71.0 |
| 3 | 1 | 51.0 |
| 4 | 2 | 80.0 |
| 5 | 2 | 68.0 |
| 6 | 2 | 54.0 |
| 7 | 2 | 59.0 |
| 8 | 3 | 72.0 |
| 9 | 3 | 100.0 |
| 10 | 3 | 72.0 |
| 11 | 3 | 158.0 |
| 12 | 4 | 69.0 |
| 13 | 4 | 91.0 |
| 14 | 4 | 55.0 |
| 15 | 4 | 103.0 |
| 16 | 5 | 76.0 |
| 17 | 5 | 85.0 |
| 18 | 5 | 57.0 |
| 19 | 5 | 80.0 |
# Concatenate all 9 datasets by index
df_concatenated = pd.concat([df_2015_new['AQI_2015'], df_2016_new, df_2017_new["AQI_2017"], df_2018_new["AQI_2018"], df_2019_new["AQI_2019"], df_2020_new["AQI_2020"], df_2021_new["AQI_2021"], df_2022_new["AQI_2022"], df_2023_new["AQI_2023"]], axis=1)
# Display the concatenated dataframe
df_concatenated
| AQI_2015 | Month | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 44 | 74.0 | 12 | 140.0 | 43.0 | 153.0 | 145.0 | 53.0 | 92.0 | 126.0 | NaN |
| 45 | 46.0 | 12 | 84.0 | 40.0 | 116.0 | 161.0 | 59.0 | 71.0 | 81.0 | NaN |
| 46 | 103.0 | 12 | 66.0 | 23.0 | 58.0 | 154.0 | 35.0 | 110.0 | 85.0 | NaN |
| 47 | 71.0 | 12 | 52.0 | 71.0 | 43.0 | 148.0 | 76.0 | 80.0 | 149.0 | NaN |
| 0 | NaN | 1 | 165.0 | 152.0 | 163.0 | 48.0 | 136.0 | 108.0 | 118.0 | 87.0 |
| 1 | NaN | 1 | 63.0 | 92.0 | 56.0 | 143.0 | 162.0 | 81.0 | 101.0 | 65.0 |
| 2 | NaN | 1 | 162.0 | 34.0 | 49.0 | 99.0 | 111.0 | 47.0 | 77.0 | 71.0 |
| 3 | NaN | 1 | 79.0 | 86.0 | 68.0 | 40.0 | 145.0 | 75.0 | 71.0 | 51.0 |
| 4 | NaN | 2 | 74.0 | 71.0 | 136.0 | 140.0 | 117.0 | 174.0 | 74.0 | 80.0 |
| 5 | NaN | 2 | 90.0 | 51.0 | 111.0 | 107.0 | 57.0 | 43.0 | 51.0 | 68.0 |
| 6 | NaN | 2 | 79.0 | 42.0 | 64.0 | 108.0 | 126.0 | 58.0 | 29.0 | 54.0 |
| 7 | NaN | 2 | 85.0 | 99.0 | 149.0 | 77.0 | 84.0 | 142.0 | 77.0 | 59.0 |
| 8 | NaN | 3 | 64.0 | 107.0 | 110.0 | 151.0 | 85.0 | 132.0 | 63.0 | 72.0 |
| 9 | NaN | 3 | 73.0 | 68.0 | 65.0 | 120.0 | 125.0 | 63.0 | 64.0 | 100.0 |
| 10 | NaN | 3 | 152.0 | 83.0 | 159.0 | 88.0 | 151.0 | 118.0 | 108.0 | 72.0 |
| 11 | NaN | 3 | 84.0 | 114.0 | 167.0 | 36.0 | 152.0 | 60.0 | 89.0 | 158.0 |
| 12 | NaN | 4 | 188.0 | 28.0 | 70.0 | 140.0 | 146.0 | 17.0 | 122.0 | 69.0 |
| 13 | NaN | 4 | 85.0 | 56.0 | 70.0 | 91.0 | 66.0 | 70.0 | 67.0 | 91.0 |
| 14 | NaN | 4 | 164.0 | 81.0 | 141.0 | 61.0 | 87.0 | 129.0 | 111.0 | 55.0 |
| 15 | NaN | 4 | 77.0 | 62.0 | 80.0 | 154.0 | 99.0 | 149.0 | 80.0 | 103.0 |
| 16 | NaN | 5 | 115.0 | 153.0 | 166.0 | 133.0 | 146.0 | 153.0 | 160.0 | 76.0 |
| 17 | NaN | 5 | 160.0 | 98.0 | 110.0 | 151.0 | 149.0 | 139.0 | 91.0 | 85.0 |
| 18 | NaN | 5 | 120.0 | 130.0 | 153.0 | 146.0 | 162.0 | 105.0 | 94.0 | 57.0 |
| 19 | NaN | 5 | 151.0 | 156.0 | 63.0 | 162.0 | 105.0 | 122.0 | 100.0 | 80.0 |
| 20 | NaN | 6 | 166.0 | 44.0 | 175.0 | 155.0 | 157.0 | 120.0 | 98.0 | NaN |
| 21 | NaN | 6 | 80.0 | 152.0 | 162.0 | 119.0 | 171.0 | 154.0 | 66.0 | NaN |
| 22 | NaN | 6 | 154.0 | 98.0 | 156.0 | 102.0 | 160.0 | 165.0 | 99.0 | NaN |
| 23 | NaN | 6 | 64.0 | 187.0 | 93.0 | 167.0 | 151.0 | 187.0 | 74.0 | NaN |
| 24 | NaN | 7 | 162.0 | 79.0 | 172.0 | 153.0 | 150.0 | 97.0 | 145.0 | NaN |
| 25 | NaN | 7 | 101.0 | 144.0 | 151.0 | 152.0 | 149.0 | 157.0 | 99.0 | NaN |
| 26 | NaN | 7 | 110.0 | 130.0 | 160.0 | 154.0 | 180.0 | 133.0 | 59.0 | NaN |
| 27 | NaN | 7 | 85.0 | 85.0 | 175.0 | 155.0 | 154.0 | 106.0 | 98.0 | NaN |
| 28 | NaN | 8 | 128.0 | 72.0 | 173.0 | 152.0 | 181.0 | 70.0 | 152.0 | NaN |
| 29 | NaN | 8 | 105.0 | 112.0 | 140.0 | 111.0 | 77.0 | 85.0 | 82.0 | NaN |
| 30 | NaN | 8 | 110.0 | 161.0 | 163.0 | 175.0 | 89.0 | 78.0 | 138.0 | NaN |
| 31 | NaN | 8 | 129.0 | 135.0 | 135.0 | 156.0 | 171.0 | 141.0 | 135.0 | NaN |
| 32 | NaN | 9 | 151.0 | 108.0 | 164.0 | 157.0 | 134.0 | 153.0 | 153.0 | NaN |
| 33 | NaN | 9 | 152.0 | 51.0 | 95.0 | 158.0 | 85.0 | 109.0 | 110.0 | NaN |
| 34 | NaN | 9 | 168.0 | 152.0 | 111.0 | 130.0 | 136.0 | 75.0 | 121.0 | NaN |
| 35 | NaN | 9 | 73.0 | 123.0 | 173.0 | 158.0 | 155.0 | 79.0 | 130.0 | NaN |
| 36 | NaN | 10 | 137.0 | 118.0 | 132.0 | 183.0 | 116.0 | 103.0 | 154.0 | NaN |
| 37 | NaN | 10 | 56.0 | 62.0 | 90.0 | 156.0 | 126.0 | 126.0 | 86.0 | NaN |
| 38 | NaN | 10 | 80.0 | 85.0 | 152.0 | 143.0 | 129.0 | 91.0 | 110.0 | NaN |
| 39 | NaN | 10 | 136.0 | 110.0 | 95.0 | 151.0 | 120.0 | 90.0 | 128.0 | NaN |
| 40 | NaN | 11 | 66.0 | 57.0 | 182.0 | 92.0 | 108.0 | 152.0 | 152.0 | NaN |
| 41 | NaN | 11 | 155.0 | 18.0 | 130.0 | 162.0 | 151.0 | 59.0 | 70.0 | NaN |
| 42 | NaN | 11 | 156.0 | 120.0 | 67.0 | 110.0 | 66.0 | 54.0 | 53.0 | NaN |
| 43 | NaN | 11 | 126.0 | 76.0 | 137.0 | 168.0 | 162.0 | 13.0 | 63.0 | NaN |
# Get the "Month" column
month_column = df_concatenated["Month"]
# Remove the "Month" column from the dataframe
df_concatenated = df_concatenated.drop("Month",axis=1)
# Insert the "Month" column at the first position
df_concatenated.insert(0, "Month", month_column)
# Reset the index
df_concatenated = df_concatenated.reset_index(drop=True)
# Display the updated dataframe
df_concatenated
| Month | AQI_2015 | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12 | 74.0 | 140.0 | 43.0 | 153.0 | 145.0 | 53.0 | 92.0 | 126.0 | NaN |
| 1 | 12 | 46.0 | 84.0 | 40.0 | 116.0 | 161.0 | 59.0 | 71.0 | 81.0 | NaN |
| 2 | 12 | 103.0 | 66.0 | 23.0 | 58.0 | 154.0 | 35.0 | 110.0 | 85.0 | NaN |
| 3 | 12 | 71.0 | 52.0 | 71.0 | 43.0 | 148.0 | 76.0 | 80.0 | 149.0 | NaN |
| 4 | 1 | NaN | 165.0 | 152.0 | 163.0 | 48.0 | 136.0 | 108.0 | 118.0 | 87.0 |
| 5 | 1 | NaN | 63.0 | 92.0 | 56.0 | 143.0 | 162.0 | 81.0 | 101.0 | 65.0 |
| 6 | 1 | NaN | 162.0 | 34.0 | 49.0 | 99.0 | 111.0 | 47.0 | 77.0 | 71.0 |
| 7 | 1 | NaN | 79.0 | 86.0 | 68.0 | 40.0 | 145.0 | 75.0 | 71.0 | 51.0 |
| 8 | 2 | NaN | 74.0 | 71.0 | 136.0 | 140.0 | 117.0 | 174.0 | 74.0 | 80.0 |
| 9 | 2 | NaN | 90.0 | 51.0 | 111.0 | 107.0 | 57.0 | 43.0 | 51.0 | 68.0 |
| 10 | 2 | NaN | 79.0 | 42.0 | 64.0 | 108.0 | 126.0 | 58.0 | 29.0 | 54.0 |
| 11 | 2 | NaN | 85.0 | 99.0 | 149.0 | 77.0 | 84.0 | 142.0 | 77.0 | 59.0 |
| 12 | 3 | NaN | 64.0 | 107.0 | 110.0 | 151.0 | 85.0 | 132.0 | 63.0 | 72.0 |
| 13 | 3 | NaN | 73.0 | 68.0 | 65.0 | 120.0 | 125.0 | 63.0 | 64.0 | 100.0 |
| 14 | 3 | NaN | 152.0 | 83.0 | 159.0 | 88.0 | 151.0 | 118.0 | 108.0 | 72.0 |
| 15 | 3 | NaN | 84.0 | 114.0 | 167.0 | 36.0 | 152.0 | 60.0 | 89.0 | 158.0 |
| 16 | 4 | NaN | 188.0 | 28.0 | 70.0 | 140.0 | 146.0 | 17.0 | 122.0 | 69.0 |
| 17 | 4 | NaN | 85.0 | 56.0 | 70.0 | 91.0 | 66.0 | 70.0 | 67.0 | 91.0 |
| 18 | 4 | NaN | 164.0 | 81.0 | 141.0 | 61.0 | 87.0 | 129.0 | 111.0 | 55.0 |
| 19 | 4 | NaN | 77.0 | 62.0 | 80.0 | 154.0 | 99.0 | 149.0 | 80.0 | 103.0 |
| 20 | 5 | NaN | 115.0 | 153.0 | 166.0 | 133.0 | 146.0 | 153.0 | 160.0 | 76.0 |
| 21 | 5 | NaN | 160.0 | 98.0 | 110.0 | 151.0 | 149.0 | 139.0 | 91.0 | 85.0 |
| 22 | 5 | NaN | 120.0 | 130.0 | 153.0 | 146.0 | 162.0 | 105.0 | 94.0 | 57.0 |
| 23 | 5 | NaN | 151.0 | 156.0 | 63.0 | 162.0 | 105.0 | 122.0 | 100.0 | 80.0 |
| 24 | 6 | NaN | 166.0 | 44.0 | 175.0 | 155.0 | 157.0 | 120.0 | 98.0 | NaN |
| 25 | 6 | NaN | 80.0 | 152.0 | 162.0 | 119.0 | 171.0 | 154.0 | 66.0 | NaN |
| 26 | 6 | NaN | 154.0 | 98.0 | 156.0 | 102.0 | 160.0 | 165.0 | 99.0 | NaN |
| 27 | 6 | NaN | 64.0 | 187.0 | 93.0 | 167.0 | 151.0 | 187.0 | 74.0 | NaN |
| 28 | 7 | NaN | 162.0 | 79.0 | 172.0 | 153.0 | 150.0 | 97.0 | 145.0 | NaN |
| 29 | 7 | NaN | 101.0 | 144.0 | 151.0 | 152.0 | 149.0 | 157.0 | 99.0 | NaN |
| 30 | 7 | NaN | 110.0 | 130.0 | 160.0 | 154.0 | 180.0 | 133.0 | 59.0 | NaN |
| 31 | 7 | NaN | 85.0 | 85.0 | 175.0 | 155.0 | 154.0 | 106.0 | 98.0 | NaN |
| 32 | 8 | NaN | 128.0 | 72.0 | 173.0 | 152.0 | 181.0 | 70.0 | 152.0 | NaN |
| 33 | 8 | NaN | 105.0 | 112.0 | 140.0 | 111.0 | 77.0 | 85.0 | 82.0 | NaN |
| 34 | 8 | NaN | 110.0 | 161.0 | 163.0 | 175.0 | 89.0 | 78.0 | 138.0 | NaN |
| 35 | 8 | NaN | 129.0 | 135.0 | 135.0 | 156.0 | 171.0 | 141.0 | 135.0 | NaN |
| 36 | 9 | NaN | 151.0 | 108.0 | 164.0 | 157.0 | 134.0 | 153.0 | 153.0 | NaN |
| 37 | 9 | NaN | 152.0 | 51.0 | 95.0 | 158.0 | 85.0 | 109.0 | 110.0 | NaN |
| 38 | 9 | NaN | 168.0 | 152.0 | 111.0 | 130.0 | 136.0 | 75.0 | 121.0 | NaN |
| 39 | 9 | NaN | 73.0 | 123.0 | 173.0 | 158.0 | 155.0 | 79.0 | 130.0 | NaN |
| 40 | 10 | NaN | 137.0 | 118.0 | 132.0 | 183.0 | 116.0 | 103.0 | 154.0 | NaN |
| 41 | 10 | NaN | 56.0 | 62.0 | 90.0 | 156.0 | 126.0 | 126.0 | 86.0 | NaN |
| 42 | 10 | NaN | 80.0 | 85.0 | 152.0 | 143.0 | 129.0 | 91.0 | 110.0 | NaN |
| 43 | 10 | NaN | 136.0 | 110.0 | 95.0 | 151.0 | 120.0 | 90.0 | 128.0 | NaN |
| 44 | 11 | NaN | 66.0 | 57.0 | 182.0 | 92.0 | 108.0 | 152.0 | 152.0 | NaN |
| 45 | 11 | NaN | 155.0 | 18.0 | 130.0 | 162.0 | 151.0 | 59.0 | 70.0 | NaN |
| 46 | 11 | NaN | 156.0 | 120.0 | 67.0 | 110.0 | 66.0 | 54.0 | 53.0 | NaN |
| 47 | 11 | NaN | 126.0 | 76.0 | 137.0 | 168.0 | 162.0 | 13.0 | 63.0 | NaN |
# Replace NaN values with 0
df_concatenated = df_concatenated.fillna(0)
df_concatenated
| Month | AQI_2015 | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12 | 74.0 | 140.0 | 43.0 | 153.0 | 145.0 | 53.0 | 92.0 | 126.0 | 0.0 |
| 1 | 12 | 46.0 | 84.0 | 40.0 | 116.0 | 161.0 | 59.0 | 71.0 | 81.0 | 0.0 |
| 2 | 12 | 103.0 | 66.0 | 23.0 | 58.0 | 154.0 | 35.0 | 110.0 | 85.0 | 0.0 |
| 3 | 12 | 71.0 | 52.0 | 71.0 | 43.0 | 148.0 | 76.0 | 80.0 | 149.0 | 0.0 |
| 4 | 1 | 0.0 | 165.0 | 152.0 | 163.0 | 48.0 | 136.0 | 108.0 | 118.0 | 87.0 |
| 5 | 1 | 0.0 | 63.0 | 92.0 | 56.0 | 143.0 | 162.0 | 81.0 | 101.0 | 65.0 |
| 6 | 1 | 0.0 | 162.0 | 34.0 | 49.0 | 99.0 | 111.0 | 47.0 | 77.0 | 71.0 |
| 7 | 1 | 0.0 | 79.0 | 86.0 | 68.0 | 40.0 | 145.0 | 75.0 | 71.0 | 51.0 |
| 8 | 2 | 0.0 | 74.0 | 71.0 | 136.0 | 140.0 | 117.0 | 174.0 | 74.0 | 80.0 |
| 9 | 2 | 0.0 | 90.0 | 51.0 | 111.0 | 107.0 | 57.0 | 43.0 | 51.0 | 68.0 |
| 10 | 2 | 0.0 | 79.0 | 42.0 | 64.0 | 108.0 | 126.0 | 58.0 | 29.0 | 54.0 |
| 11 | 2 | 0.0 | 85.0 | 99.0 | 149.0 | 77.0 | 84.0 | 142.0 | 77.0 | 59.0 |
| 12 | 3 | 0.0 | 64.0 | 107.0 | 110.0 | 151.0 | 85.0 | 132.0 | 63.0 | 72.0 |
| 13 | 3 | 0.0 | 73.0 | 68.0 | 65.0 | 120.0 | 125.0 | 63.0 | 64.0 | 100.0 |
| 14 | 3 | 0.0 | 152.0 | 83.0 | 159.0 | 88.0 | 151.0 | 118.0 | 108.0 | 72.0 |
| 15 | 3 | 0.0 | 84.0 | 114.0 | 167.0 | 36.0 | 152.0 | 60.0 | 89.0 | 158.0 |
| 16 | 4 | 0.0 | 188.0 | 28.0 | 70.0 | 140.0 | 146.0 | 17.0 | 122.0 | 69.0 |
| 17 | 4 | 0.0 | 85.0 | 56.0 | 70.0 | 91.0 | 66.0 | 70.0 | 67.0 | 91.0 |
| 18 | 4 | 0.0 | 164.0 | 81.0 | 141.0 | 61.0 | 87.0 | 129.0 | 111.0 | 55.0 |
| 19 | 4 | 0.0 | 77.0 | 62.0 | 80.0 | 154.0 | 99.0 | 149.0 | 80.0 | 103.0 |
| 20 | 5 | 0.0 | 115.0 | 153.0 | 166.0 | 133.0 | 146.0 | 153.0 | 160.0 | 76.0 |
| 21 | 5 | 0.0 | 160.0 | 98.0 | 110.0 | 151.0 | 149.0 | 139.0 | 91.0 | 85.0 |
| 22 | 5 | 0.0 | 120.0 | 130.0 | 153.0 | 146.0 | 162.0 | 105.0 | 94.0 | 57.0 |
| 23 | 5 | 0.0 | 151.0 | 156.0 | 63.0 | 162.0 | 105.0 | 122.0 | 100.0 | 80.0 |
| 24 | 6 | 0.0 | 166.0 | 44.0 | 175.0 | 155.0 | 157.0 | 120.0 | 98.0 | 0.0 |
| 25 | 6 | 0.0 | 80.0 | 152.0 | 162.0 | 119.0 | 171.0 | 154.0 | 66.0 | 0.0 |
| 26 | 6 | 0.0 | 154.0 | 98.0 | 156.0 | 102.0 | 160.0 | 165.0 | 99.0 | 0.0 |
| 27 | 6 | 0.0 | 64.0 | 187.0 | 93.0 | 167.0 | 151.0 | 187.0 | 74.0 | 0.0 |
| 28 | 7 | 0.0 | 162.0 | 79.0 | 172.0 | 153.0 | 150.0 | 97.0 | 145.0 | 0.0 |
| 29 | 7 | 0.0 | 101.0 | 144.0 | 151.0 | 152.0 | 149.0 | 157.0 | 99.0 | 0.0 |
| 30 | 7 | 0.0 | 110.0 | 130.0 | 160.0 | 154.0 | 180.0 | 133.0 | 59.0 | 0.0 |
| 31 | 7 | 0.0 | 85.0 | 85.0 | 175.0 | 155.0 | 154.0 | 106.0 | 98.0 | 0.0 |
| 32 | 8 | 0.0 | 128.0 | 72.0 | 173.0 | 152.0 | 181.0 | 70.0 | 152.0 | 0.0 |
| 33 | 8 | 0.0 | 105.0 | 112.0 | 140.0 | 111.0 | 77.0 | 85.0 | 82.0 | 0.0 |
| 34 | 8 | 0.0 | 110.0 | 161.0 | 163.0 | 175.0 | 89.0 | 78.0 | 138.0 | 0.0 |
| 35 | 8 | 0.0 | 129.0 | 135.0 | 135.0 | 156.0 | 171.0 | 141.0 | 135.0 | 0.0 |
| 36 | 9 | 0.0 | 151.0 | 108.0 | 164.0 | 157.0 | 134.0 | 153.0 | 153.0 | 0.0 |
| 37 | 9 | 0.0 | 152.0 | 51.0 | 95.0 | 158.0 | 85.0 | 109.0 | 110.0 | 0.0 |
| 38 | 9 | 0.0 | 168.0 | 152.0 | 111.0 | 130.0 | 136.0 | 75.0 | 121.0 | 0.0 |
| 39 | 9 | 0.0 | 73.0 | 123.0 | 173.0 | 158.0 | 155.0 | 79.0 | 130.0 | 0.0 |
| 40 | 10 | 0.0 | 137.0 | 118.0 | 132.0 | 183.0 | 116.0 | 103.0 | 154.0 | 0.0 |
| 41 | 10 | 0.0 | 56.0 | 62.0 | 90.0 | 156.0 | 126.0 | 126.0 | 86.0 | 0.0 |
| 42 | 10 | 0.0 | 80.0 | 85.0 | 152.0 | 143.0 | 129.0 | 91.0 | 110.0 | 0.0 |
| 43 | 10 | 0.0 | 136.0 | 110.0 | 95.0 | 151.0 | 120.0 | 90.0 | 128.0 | 0.0 |
| 44 | 11 | 0.0 | 66.0 | 57.0 | 182.0 | 92.0 | 108.0 | 152.0 | 152.0 | 0.0 |
| 45 | 11 | 0.0 | 155.0 | 18.0 | 130.0 | 162.0 | 151.0 | 59.0 | 70.0 | 0.0 |
| 46 | 11 | 0.0 | 156.0 | 120.0 | 67.0 | 110.0 | 66.0 | 54.0 | 53.0 | 0.0 |
| 47 | 11 | 0.0 | 126.0 | 76.0 | 137.0 | 168.0 | 162.0 | 13.0 | 63.0 | 0.0 |
corr_matrix = df_concatenated.corr()
corr_matrix
| Month | AQI_2015 | AQI_2016 | AQI_2017 | AQI_2018 | AQI_2019 | AQI_2020 | AQI_2021 | AQI_2022 | AQI_2023 | |
|---|---|---|---|---|---|---|---|---|---|---|
| Month | 1.000000 | 0.461729 | 0.027517 | -0.083456 | 0.143352 | 0.548952 | -0.196433 | -0.062703 | 0.355402 | -0.781821 |
| AQI_2015 | 0.461729 | 1.000000 | -0.216949 | -0.349050 | -0.246027 | 0.159908 | -0.539733 | -0.076364 | 0.098740 | -0.227872 |
| AQI_2016 | 0.027517 | -0.216949 | 1.000000 | 0.013218 | 0.169906 | -0.004429 | 0.179335 | -0.149311 | 0.284263 | -0.066636 |
| AQI_2017 | -0.083456 | -0.349050 | 0.013218 | 1.000000 | 0.287847 | 0.034356 | 0.335555 | 0.410284 | 0.197767 | -0.042916 |
| AQI_2018 | 0.143352 | -0.246027 | 0.169906 | 0.287847 | 1.000000 | -0.008166 | 0.395382 | 0.326390 | 0.386774 | -0.245813 |
| AQI_2019 | 0.548952 | 0.159908 | -0.004429 | 0.034356 | -0.008166 | 1.000000 | 0.078618 | 0.097288 | 0.222024 | -0.523639 |
| AQI_2020 | -0.196433 | -0.539733 | 0.179335 | 0.335555 | 0.395382 | 0.078618 | 1.000000 | 0.149633 | 0.141697 | -0.036097 |
| AQI_2021 | -0.062703 | -0.076364 | -0.149311 | 0.410284 | 0.326390 | 0.097288 | 0.149633 | 1.000000 | 0.165685 | -0.081594 |
| AQI_2022 | 0.355402 | 0.098740 | 0.284263 | 0.197767 | 0.386774 | 0.222024 | 0.141697 | 0.165685 | 1.000000 | -0.281181 |
| AQI_2023 | -0.781821 | -0.227872 | -0.066636 | -0.042916 | -0.245813 | -0.523639 | -0.036097 | -0.081594 | -0.281181 | 1.000000 |
#Plot the pairplot of AQI from one to other years
sns.pairplot(df_concatenated[['AQI_2016','AQI_2017','AQI_2018','AQI_2019','AQI_2020','AQI_2021','AQI_2022','AQI_2023']], kind='reg')
<seaborn.axisgrid.PairGrid at 0x29f9a44c0>
# Generate a correlation matrix for the AQI columns
corr_matrix = df_concatenated.iloc[:, 1:].corr()
# Plot a heatmap of the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# Set the title of the plot
plt.title('Heatmap of AQI correlation among all years')
# Show the plot
plt.show()
plt.figure(figsize=(15, 6), dpi=100)
plt.grid(True)
sns.lineplot(data=df_sampled, x=df_sampled['Date_LT'], y="AQI", hue="Year", ci=0, palette='Set1')
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.title('AQI Monthly for Each Year')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.show()
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/544724590.py:4: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', 0)` for the same effect.
sns.lineplot(data=df_sampled, x=df_sampled['Date_LT'], y="AQI", hue="Year", ci=0, palette='Set1')